#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif
#include <lsxintrin.h>
#ifndef _NPY_SIMD_LSX_MISC_H
#define _NPY_SIMD_LSX_MISC_H

// vector with zero lanes
// Every variant is built from __lsx_vldi(0); the float/double variants cast that
// result. Their expansions are wrapped in parentheses so the cast applies to the
// whole intrinsic result even when the macro is followed by a postfix operator
// such as a subscript.
#define npyv_zero_u8()  __lsx_vldi(0)
#define npyv_zero_s8()  __lsx_vldi(0)
#define npyv_zero_u16() __lsx_vldi(0)
#define npyv_zero_s16() __lsx_vldi(0)
#define npyv_zero_u32() __lsx_vldi(0)
#define npyv_zero_s32() __lsx_vldi(0)
#define npyv_zero_u64() __lsx_vldi(0)
#define npyv_zero_s64() __lsx_vldi(0)
#define npyv_zero_f32() ((__m128)__lsx_vldi(0))
#define npyv_zero_f64() ((__m128d)__lsx_vldi(0))

// vector with a specific value set to all lanes
// The integer variants cast VAL to the lane type and pass it to the matching
// __lsx_vreplgr2vr_* intrinsic. The float/double variants build the vector from a
// brace initializer; their expansion and every use of VAL are parenthesized so the
// macro stays a single expression under any surrounding operator.
// NOTE: the f32/f64 variants expand VAL once per lane, so VAL must be free of
// side effects.
#define npyv_setall_u8(VAL)  __lsx_vreplgr2vr_b((unsigned char)(VAL))
#define npyv_setall_s8(VAL)  __lsx_vreplgr2vr_b((signed char)(VAL))
#define npyv_setall_u16(VAL) __lsx_vreplgr2vr_h((unsigned short)(VAL))
#define npyv_setall_s16(VAL) __lsx_vreplgr2vr_h((signed short)(VAL))
#define npyv_setall_u32(VAL) __lsx_vreplgr2vr_w((unsigned int)(VAL))
#define npyv_setall_s32(VAL) __lsx_vreplgr2vr_w((signed int)(VAL))
#define npyv_setall_u64(VAL) __lsx_vreplgr2vr_d((unsigned long long)(VAL))
#define npyv_setall_s64(VAL) __lsx_vreplgr2vr_d((long long)(VAL))
#define npyv_setall_f32(VAL) ((__m128)(v4f32){(VAL), (VAL), (VAL), (VAL)})
#define npyv_setall_f64(VAL) ((__m128d)(v2f64){(VAL), (VAL)})

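/**
 * Vector with specific values set to each lane and
 * a specific value set to all remaining lanes.
 *
 * Arguments generated by NPYV__SET_FILL_* are not going to be expanded if
 * the setters they are passed to are defined as macros (the note originally
 * names _mm_setr_*); presumably that is why the npyv__set_* setters below
 * are inline functions.
 */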
NPY_FINLINE __m128i npyv__set_u8(
    npy_uint8 i0, npy_uint8 i1, npy_uint8 i2,  npy_uint8 i3,  npy_uint8 i4,  npy_uint8 i5,  npy_uint8 i6,  npy_uint8 i7,
    npy_uint8 i8, npy_uint8 i9, npy_uint8 i10, npy_uint8 i11, npy_uint8 i12, npy_uint8 i13, npy_uint8 i14, npy_uint8 i15)
{
    // Lay out the sixteen unsigned 8-bit lanes in argument order (i0 is lane 0)
    // and return them as a generic 128-bit integer vector.
    return (__m128i)(v16u8){
        i0, i1, i2,  i3,  i4,  i5,  i6,  i7,
        i8, i9, i10, i11, i12, i13, i14, i15
    };
}
NPY_FINLINE __m128i npyv__set_s8(
    npy_int8 i0, npy_int8 i1, npy_int8 i2,  npy_int8 i3,  npy_int8 i4,  npy_int8 i5,  npy_int8 i6,  npy_int8 i7,
    npy_int8 i8, npy_int8 i9, npy_int8 i10, npy_int8 i11, npy_int8 i12, npy_int8 i13, npy_int8 i14, npy_int8 i15)
{
    // Lay out the sixteen signed 8-bit lanes in argument order (i0 is lane 0)
    // and return them as a generic 128-bit integer vector.
    return (__m128i)(v16i8){
        i0, i1, i2,  i3,  i4,  i5,  i6,  i7,
        i8, i9, i10, i11, i12, i13, i14, i15
    };
}
NPY_FINLINE __m128i npyv__set_u16(npy_uint16 i0, npy_uint16 i1, npy_uint16 i2, npy_uint16 i3, npy_uint16 i4, npy_uint16 i5,
                                     npy_uint16 i6, npy_uint16 i7)
{
    // Eight unsigned 16-bit lanes in argument order, returned as a 128-bit integer vector.
    return (__m128i)(v8u16){
        i0, i1, i2, i3,
        i4, i5, i6, i7
    };
}
NPY_FINLINE __m128i npyv__set_s16(npy_int16 i0, npy_int16 i1, npy_int16 i2, npy_int16 i3, npy_int16 i4, npy_int16 i5,
                                     npy_int16 i6, npy_int16 i7)
{
    // Eight signed 16-bit lanes in argument order, returned as a 128-bit integer vector.
    return (__m128i)(v8i16){
        i0, i1, i2, i3,
        i4, i5, i6, i7
    };
}
NPY_FINLINE __m128i npyv__set_u32(npy_uint32 i0, npy_uint32 i1, npy_uint32 i2, npy_uint32 i3)
{
    // Four unsigned 32-bit lanes in argument order, returned as a 128-bit integer vector.
    return (__m128i)(v4u32){i0, i1, i2, i3};
}
NPY_FINLINE __m128i npyv__set_s32(npy_int32 i0, npy_int32 i1, npy_int32 i2, npy_int32 i3)
{
    // Four signed 32-bit lanes in argument order, returned as a 128-bit integer vector.
    return (__m128i)(v4i32){i0, i1, i2, i3};
}
NPY_FINLINE __m128i npyv__set_u64(npy_uint64 i0, npy_uint64 i1)
{
    // Two unsigned 64-bit lanes in argument order, returned as a 128-bit integer vector.
    return (__m128i)(v2u64){i0, i1};
}
NPY_FINLINE __m128i npyv__set_s64(npy_int64 i0, npy_int64 i1)
{
    // Two signed 64-bit lanes in argument order, returned as a 128-bit integer vector.
    return (__m128i)(v2i64){i0, i1};
}
NPY_FINLINE __m128 npyv__set_f32(float i0, float i1, float i2, float i3)
{
    // Four single-precision lanes in argument order (i0 is lane 0).
    return (__m128){i0, i1, i2, i3};
}
NPY_FINLINE __m128d npyv__set_f64(double i0, double i1)
{
    // Two double-precision lanes in argument order (i0 is lane 0).
    return (__m128d){i0, i1};
}
// Lane setters with an explicit fill value: each macro forwards to the matching
// npyv__set_* function, handing it the argument list produced by NPYV__SET_FILL_N
// (not defined in this header) from a cast type, FILL and the given lane values.
// NOTE(review): presumably NPYV__SET_FILL_N pads the list to N entries with FILL
// and applies the cast to every entry - confirm against its definition.
// The unsigned variants pass the same cast type as their signed counterparts
// (char, short, int, npy_int64); the values are then converted to the unsigned
// parameter types of npyv__set_u*.
#define npyv_setf_u8(FILL, ...)  npyv__set_u8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__))
#define npyv_setf_s8(FILL, ...)  npyv__set_s8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__))
#define npyv_setf_u16(FILL, ...) npyv__set_u16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__))
#define npyv_setf_s16(FILL, ...) npyv__set_s16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__))
#define npyv_setf_u32(FILL, ...) npyv__set_u32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__))
#define npyv_setf_s32(FILL, ...) npyv__set_s32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__))
#define npyv_setf_u64(FILL, ...) npyv__set_u64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__))
#define npyv_setf_s64(FILL, ...) npyv__set_s64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__))
#define npyv_setf_f32(FILL, ...) npyv__set_f32(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__))
#define npyv_setf_f64(FILL, ...) npyv__set_f64(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__))

// vector with specific values set to each lane and
// zero set to all remaining lanes
// (each macro is the matching npyv_setf_* with a FILL of 0)
#define npyv_set_u8(...)  npyv_setf_u8(0,  __VA_ARGS__)
#define npyv_set_s8(...)  npyv_setf_s8(0,  __VA_ARGS__)
#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__)
#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__)
#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__)
#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__)
#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__)
#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__)
#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__)
#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__)

// Per lane select
NPY_FINLINE __m128i npyv_select_u8(__m128i mask, __m128i a, __m128i b)
{
    // __lsx_vbitsel_v blends bit by bit: where `mask` has a bit set the result
    // takes it from `a`, elsewhere from `b` (note the operand order b, a, mask).
    const __m128i when_set = a;
    const __m128i when_clear = b;
    return __lsx_vbitsel_v(when_clear, when_set, mask);
}

NPY_FINLINE __m128 npyv_select_f32(__m128i mask, __m128 a, __m128 b)
{
    // View both float vectors as integer vectors, blend them with the bitwise
    // select (operand order b, a, mask), then view the result as floats again.
    const __m128i a_bits = (__m128i)a;
    const __m128i b_bits = (__m128i)b;
    const __m128i blended = __lsx_vbitsel_v(b_bits, a_bits, mask);
    return (__m128)blended;
}
NPY_FINLINE __m128d npyv_select_f64(__m128i mask, __m128d a, __m128d b)
{
    // View both double vectors as integer vectors, blend them with the bitwise
    // select (operand order b, a, mask), then view the result as doubles again.
    const __m128i a_bits = (__m128i)a;
    const __m128i b_bits = (__m128i)b;
    const __m128i blended = __lsx_vbitsel_v(b_bits, a_bits, mask);
    return (__m128d)blended;
}

// The remaining integer lane types are plain aliases of npyv_select_u8.
#define npyv_select_s8  npyv_select_u8
#define npyv_select_u16 npyv_select_u8
#define npyv_select_s16 npyv_select_u8
#define npyv_select_u32 npyv_select_u8
#define npyv_select_s32 npyv_select_u8
#define npyv_select_u64 npyv_select_u8
#define npyv_select_s64 npyv_select_u8

// extract the first vector's lane
// Integer variants read element 0 through the __lsx_vpickve2gr_* intrinsics and
// cast the result to the lane type. The float/double variants subscript the
// vector directly; the argument and the whole expansion are parenthesized so
// that an expression argument such as `a + b` yields lane 0 of the sum rather
// than `a + b[0]`.
#define npyv_extract0_u8(A) ((npy_uint8)__lsx_vpickve2gr_bu(A, 0))
#define npyv_extract0_s8(A) ((npy_int8)__lsx_vpickve2gr_b(A, 0))
#define npyv_extract0_u16(A) ((npy_uint16)__lsx_vpickve2gr_hu(A, 0))
#define npyv_extract0_s16(A) ((npy_int16)__lsx_vpickve2gr_h(A, 0))
#define npyv_extract0_u32(A) ((npy_uint32)__lsx_vpickve2gr_wu(A, 0))
#define npyv_extract0_s32(A) ((npy_int32)__lsx_vpickve2gr_w(A, 0))
#define npyv_extract0_u64(A) ((npy_uint64)__lsx_vpickve2gr_du(A, 0))
#define npyv_extract0_s64(A) ((npy_int64)__lsx_vpickve2gr_d(A, 0))
#define npyv_extract0_f32(A) ((A)[0])
#define npyv_extract0_f64(A) ((A)[0])

// Reinterpret
// To u8: integer sources expand to the argument unchanged, float/double sources
// are cast to __m128i. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_u8_u8(X)  (X)
#define npyv_reinterpret_u8_s8(X)  (X)
#define npyv_reinterpret_u8_u16(X) (X)
#define npyv_reinterpret_u8_s16(X) (X)
#define npyv_reinterpret_u8_u32(X) (X)
#define npyv_reinterpret_u8_s32(X) (X)
#define npyv_reinterpret_u8_u64(X) (X)
#define npyv_reinterpret_u8_s64(X) (X)
#define npyv_reinterpret_u8_f32(X) ((__m128i)(X))
#define npyv_reinterpret_u8_f64(X) ((__m128i)(X))

// To s8: integer sources expand to the argument unchanged, float/double sources
// are cast to __m128i. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_s8_s8(X)  (X)
#define npyv_reinterpret_s8_u8(X)  (X)
#define npyv_reinterpret_s8_u16(X) (X)
#define npyv_reinterpret_s8_s16(X) (X)
#define npyv_reinterpret_s8_u32(X) (X)
#define npyv_reinterpret_s8_s32(X) (X)
#define npyv_reinterpret_s8_u64(X) (X)
#define npyv_reinterpret_s8_s64(X) (X)
#define npyv_reinterpret_s8_f32(X) ((__m128i)(X))
#define npyv_reinterpret_s8_f64(X) ((__m128i)(X))

// To u16: integer sources expand to the argument unchanged, float/double sources
// are cast to __m128i. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_u16_u16(X) (X)
#define npyv_reinterpret_u16_u8(X)  (X)
#define npyv_reinterpret_u16_s8(X)  (X)
#define npyv_reinterpret_u16_s16(X) (X)
#define npyv_reinterpret_u16_u32(X) (X)
#define npyv_reinterpret_u16_s32(X) (X)
#define npyv_reinterpret_u16_u64(X) (X)
#define npyv_reinterpret_u16_s64(X) (X)
#define npyv_reinterpret_u16_f32(X) ((__m128i)(X))
#define npyv_reinterpret_u16_f64(X) ((__m128i)(X))

// To s16: integer sources expand to the argument unchanged, float/double sources
// are cast to __m128i. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_s16_s16(X) (X)
#define npyv_reinterpret_s16_u8(X)  (X)
#define npyv_reinterpret_s16_s8(X)  (X)
#define npyv_reinterpret_s16_u16(X) (X)
#define npyv_reinterpret_s16_u32(X) (X)
#define npyv_reinterpret_s16_s32(X) (X)
#define npyv_reinterpret_s16_u64(X) (X)
#define npyv_reinterpret_s16_s64(X) (X)
#define npyv_reinterpret_s16_f32(X) ((__m128i)(X))
#define npyv_reinterpret_s16_f64(X) ((__m128i)(X))

// To u32: integer sources expand to the argument unchanged, float/double sources
// are cast to __m128i. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_u32_u32(X) (X)
#define npyv_reinterpret_u32_u8(X)  (X)
#define npyv_reinterpret_u32_s8(X)  (X)
#define npyv_reinterpret_u32_u16(X) (X)
#define npyv_reinterpret_u32_s16(X) (X)
#define npyv_reinterpret_u32_s32(X) (X)
#define npyv_reinterpret_u32_u64(X) (X)
#define npyv_reinterpret_u32_s64(X) (X)
#define npyv_reinterpret_u32_f32(X) ((__m128i)(X))
#define npyv_reinterpret_u32_f64(X) ((__m128i)(X))

// To s32: integer sources expand to the argument unchanged, float/double sources
// are cast to __m128i. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_s32_s32(X) (X)
#define npyv_reinterpret_s32_u8(X)  (X)
#define npyv_reinterpret_s32_s8(X)  (X)
#define npyv_reinterpret_s32_u16(X) (X)
#define npyv_reinterpret_s32_s16(X) (X)
#define npyv_reinterpret_s32_u32(X) (X)
#define npyv_reinterpret_s32_u64(X) (X)
#define npyv_reinterpret_s32_s64(X) (X)
#define npyv_reinterpret_s32_f32(X) ((__m128i)(X))
#define npyv_reinterpret_s32_f64(X) ((__m128i)(X))

// To u64: integer sources expand to the argument unchanged, float/double sources
// are cast to __m128i. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_u64_u64(X) (X)
#define npyv_reinterpret_u64_u8(X)  (X)
#define npyv_reinterpret_u64_s8(X)  (X)
#define npyv_reinterpret_u64_u16(X) (X)
#define npyv_reinterpret_u64_s16(X) (X)
#define npyv_reinterpret_u64_u32(X) (X)
#define npyv_reinterpret_u64_s32(X) (X)
#define npyv_reinterpret_u64_s64(X) (X)
#define npyv_reinterpret_u64_f32(X) ((__m128i)(X))
#define npyv_reinterpret_u64_f64(X) ((__m128i)(X))

// To s64: integer sources expand to the argument unchanged, float/double sources
// are cast to __m128i. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_s64_s64(X) (X)
#define npyv_reinterpret_s64_u8(X)  (X)
#define npyv_reinterpret_s64_s8(X)  (X)
#define npyv_reinterpret_s64_u16(X) (X)
#define npyv_reinterpret_s64_s16(X) (X)
#define npyv_reinterpret_s64_u32(X) (X)
#define npyv_reinterpret_s64_s32(X) (X)
#define npyv_reinterpret_s64_u64(X) (X)
#define npyv_reinterpret_s64_f32(X) ((__m128i)(X))
#define npyv_reinterpret_s64_f64(X) ((__m128i)(X))

// To f32: the f32 source expands to the argument unchanged, every other source
// is cast to __m128. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_f32_f32(X) (X)
#define npyv_reinterpret_f32_u8(X)  ((__m128)(X))
#define npyv_reinterpret_f32_s8(X)  ((__m128)(X))
#define npyv_reinterpret_f32_u16(X) ((__m128)(X))
#define npyv_reinterpret_f32_s16(X) ((__m128)(X))
#define npyv_reinterpret_f32_u32(X) ((__m128)(X))
#define npyv_reinterpret_f32_s32(X) ((__m128)(X))
#define npyv_reinterpret_f32_u64(X) ((__m128)(X))
#define npyv_reinterpret_f32_s64(X) ((__m128)(X))
#define npyv_reinterpret_f32_f64(X) ((__m128)(X))

// To f64: the f64 source expands to the argument unchanged, every other source
// is cast to __m128d. Argument and expansion are parenthesized so the cast
// covers a whole expression argument and survives a following postfix operator.
#define npyv_reinterpret_f64_f64(X) (X)
#define npyv_reinterpret_f64_u8(X)  ((__m128d)(X))
#define npyv_reinterpret_f64_s8(X)  ((__m128d)(X))
#define npyv_reinterpret_f64_u16(X) ((__m128d)(X))
#define npyv_reinterpret_f64_s16(X) ((__m128d)(X))
#define npyv_reinterpret_f64_u32(X) ((__m128d)(X))
#define npyv_reinterpret_f64_s32(X) ((__m128d)(X))
#define npyv_reinterpret_f64_u64(X) ((__m128d)(X))
#define npyv_reinterpret_f64_s64(X) ((__m128d)(X))
#define npyv_reinterpret_f64_f32(X) ((__m128d)(X))

// Only required by AVX2/AVX512; in this header it expands to a no-op
// void expression.
#define npyv_cleanup() ((void)0)

#endif
